/**
* Copyright (c) 2011, Regents of the University of Colorado
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
* Neither the name of the University of Colorado at Boulder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
package clear.engine;

import java.io.File;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;

import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;

import clear.dep.DepNode;
import clear.dep.DepTree;
import clear.dep.srl.SRLHead;
import clear.dep.srl.SRLInfo;
import clear.morph.MorphEnAnalyzer;
import clear.propbank.PBArg;
import clear.propbank.PBInstance;
import clear.propbank.PBLib;
import clear.propbank.PBLoc;
import clear.propbank.PBReader;
import clear.treebank.TBEnConvert;
import clear.treebank.TBEnLib;
import clear.treebank.TBHeadRules;
import clear.treebank.TBNode;
import clear.treebank.TBReader;
import clear.treebank.TBTree;
import clear.util.IOUtil;
import clear.util.JSet;

import com.carrotsearch.hppc.IntOpenHashSet;

/**
 * Compare two dependency trees.
 * @author Jinho D. Choi
 * <b>Last update:</b> 4/26/2010
 */
public class PropToDep
{
	@Option(name="-i", usage="name of a file containing PropBank instances", required=true, metaVar="REQUIRED")
	String s_propFile;
	@Option(name="-o", usage="name of a directory for dependency output", required=true, metaVar="REQUIRED")
	String s_srlDir;
	@Option(name="-p", usage="name of a directory containing parse trees", required=true, metaVar="REQUIRED")
	String s_parseDir;
	@Option(name="-h", usage="name of a file containing head-percolation rules", required=true, metaVar="REQUIRED")
	String s_headruleFile;
	@Option(name="-m", usage="name of a file containing dictionaries for morphological analyzer", metaVar="OPTIONAL")
	String s_dictFile = null;
	@Option(name="-n", usage="minimum sentence length (inclusive; default = 1)", metaVar="OPTIONAL")
	int n_length = 1;
	@Option(name="-f", usage="if set, include function tags", metaVar="OPTIONAL")
	boolean b_funcTag = false;
	@Option(name="-e", usage="if set, include empty categories", metaVar="OPTIONAL")
	boolean b_ec = false;
	@Option(name="-r", usage="if set, reverse dependencies of auxiliaries and modals", metaVar="OPTIONAL")
	boolean b_reverseVC = false;
	@Option(name="-j", usage="if set, consider adjectival predicates as verbal predicates", metaVar="OPTIONAL")
	boolean b_adjPred = false;
	
//	final String PARSE_EXT = ".parse";
	final String PARSE_EXT = "";
	final String SRL_EXT   = ".srl";
	HashMap<String,PBInstance> m_pbInstances;
	
	public PropToDep(String[] args)
	{
		CmdLineParser cmd = new CmdLineParser(this);
		
		try
		{
			cmd.parseArgument(args);
			
			readPBInstances();
			merge();
		}
		catch (CmdLineException e)
		{
			System.err.println(e.getMessage());
			cmd.printUsage(System.err);
		}
	}
	
	/** Reads all PropBank instances and stores them to {@link PropToDep#m_pbInstances}. */
	public void readPBInstances()
	{
		PBReader   reader = new PBReader(s_propFile);
		PBInstance instance;
		
		m_pbInstances = new HashMap<String,PBInstance>();
		System.out.println("Initialize: "+s_propFile);
		
		while ((instance = reader.nextInstance()) != null)
		{
			if (instance.rolesetId.endsWith(".LV"))	continue;
			if (!instance.type.endsWith("-v"))		continue;
			
			m_pbInstances.put(instance.getKey(), instance);
		}
	}
	
	/** @return list of tree paths as in PropBank instance. */
	private ArrayList<String> getTreePaths()
	{
		HashSet<String> set = new HashSet<String>();
		
		for (String key : m_pbInstances.keySet())
			set.add(key.substring(0, key.indexOf(PBInstance.KEY_DELIM)));

		ArrayList<String> list = new ArrayList<String>(set);
		Collections.sort(list);
		
		return list;
	}
	
	/** @return list of PropBank instances for the tree. */
	private ArrayList<PBInstance> getPBInstances(String treePath, int treeIndex)
	{
		ArrayList<PBInstance> list = new ArrayList<PBInstance>();
		String prefix = treePath + PBInstance.KEY_DELIM + treeIndex + PBInstance.KEY_DELIM;
		
		for (String key : m_pbInstances.keySet())
		{
			if (key.startsWith(prefix))
				list.add(m_pbInstances.get(key));
		}
		
		return list;
	}
	
	public void merge()
	{
		TBReader    reader;
		TBTree      tree;
		int         treeIndex;
		String      mergeFile;
	 	PrintStream fout;
	 	DepTree     dTree;
	 	DepNode     dNode;
		
		TBHeadRules     headrules = new TBHeadRules(s_headruleFile);
		MorphEnAnalyzer morph     = (s_dictFile != null) ? new MorphEnAnalyzer(s_dictFile) : null;
		TBEnConvert     convert   = new TBEnConvert(headrules, morph, b_funcTag, b_ec, b_reverseVC);
		
		s_parseDir += File.separator;
		s_srlDir   += File.separator;
		
		ArrayList<PBInstance> list;
		
		for (String treePath : getTreePaths())
		{
			mergeFile = s_srlDir + treePath.substring(treePath.lastIndexOf(File.separator)+1) + SRL_EXT;
			reader    = new TBReader(s_parseDir + treePath + PARSE_EXT);
			fout      = IOUtil.createPrintFileStream(mergeFile);
			
			System.out.println(mergeFile);
			
			for (treeIndex=0; (tree = reader.nextTree()) != null; treeIndex++)
			{
				list = getPBInstances(treePath, treeIndex);
				if (!b_adjPred)	removeAdjectivalPredicates(tree, list);
				
				if (list.isEmpty())
				{
					dTree = convert.toDepTree(tree);
					
					for (int i=1; i<dTree.size(); i++)
					{
						dNode = dTree.get(i);
						dNode.srlInfo = new SRLInfo();
					}
				}
				else
				{
					tree.setPBLocs();
					tree.setAntecedents();
					
					for (PBInstance instance : list)
						mergeAux(instance, tree);
					
					dTree = convert.toSRLTree(tree);
				}
				
			//	fout.println(";"+treePath+" "+treeIndex);
				if (dTree.size() > n_length)	fout.println(dTree+"\n");
			}
			
			fout.close();
		}
	}
	
	protected void removeAdjectivalPredicates(TBTree tree, ArrayList<PBInstance> instances)
	{
		ArrayList<PBInstance> remove = new ArrayList<PBInstance>();
		TBNode node;
		
		for (PBInstance instance : instances)
		{
			node = tree.getNode(instance.predicateId, 1);
			if (node.isPos(TBEnLib.POS_NP) || (node.isPos(TBEnLib.POS_ADJP) && node.getParent().isPos(TBEnLib.POS_NP) && !node.isPrior(TBEnLib.POS_NP)))
				remove.add(instance);
		}
		
		instances.removeAll(remove);
	}
	
	private void mergeAux(PBInstance instance, TBTree tree)
	{
		TBNode pred = tree.getNode(instance.predicateId, 0);
		if (pred == null)
		{
			System.err.println("Wrong location of predicate: "+instance.treePath+" "+instance.treeIndex+" "+instance.predicateId);
			return;
		}
		
		ArrayList<PBArg> pbArgs  = instance.getArgs();
		ArrayList<PBArg> delArgs = new ArrayList<PBArg>();
		
		for (PBArg pbArg : pbArgs)
		{
			if (!processEmtpyCategories(pbArg, tree, pred))
				System.err.println("Wrong location in "+pbArg.label+": "+instance.toString());
		}
		
		for (PBArg pbArg : pbArgs)
		{
			if (pbArg.isLabel("rel.*"))
			{
				if (processRels(pbArg, instance.predicateId))
					delArgs.add(pbArg);
				
				continue;
			}
			
			if (pbArg.isLabel("LINK.*"))
			{
				if (!processLink(pbArgs, pbArg, tree))
					System.err.println("No-achor in "+pbArg.label+": "+instance.toString());
				
				delArgs.add(pbArg);
				continue;
			}
		}
		
		pbArgs.removeAll(delArgs);
		if (pbArgs.isEmpty())	return;
		
		if (!instance.rolesetId.endsWith(".DP"))
			pred.rolesetId = instance.rolesetId;

		for (PBArg pbArg : pbArgs)
		{
			processEmtpyCategoriesSub(pbArg, tree, pred);
			
			if (!addPBArgToTBTree(pbArg, tree))
				System.err.println("Wrong location in "+pbArg.label+": "+instance.toString());
		}
	}
	
	/** Removes <code>predId:0</code> from <code>pbArg</code>. */
	private boolean processRels(PBArg pbArg, int predId)
	{
		for (PBLoc loc : pbArg.getLocs())
		{
			if (loc.equals(predId, 0))
			{
				pbArg.removeLoc(loc);
				break;
			}
		}
		
		return pbArg.getLocs().isEmpty();
	}
	
	/** Merges LINK-argument with its anchor-argument. */
	private boolean processLink(ArrayList<PBArg> pbArgs, PBArg linkArg, TBTree tree)
	{
		PBLoc  anchor = new PBLoc(null, -1, -1);
		TBNode node;
		
		if (linkArg.isLabel("LINK-SLC"))
		{
			// find antecedent
			for (PBLoc pbLoc : linkArg.getLocs())
			{
				node = tree.getNode(pbLoc.terminalId, pbLoc.height);
				if (node == null)	return false;
				
				if (node.isPos("WH.*"))
				{
					anchor = pbLoc;
					break;
				}
			}
			
			// find antecedent in height 1
			if (anchor.terminalId == -1)
			{
				for (PBLoc pbLoc : linkArg.getLocs())
				{
					node = tree.getNode(pbLoc.terminalId, 1);
					
					if (node != null && node.isPos("WH.*"))
					{
						pbLoc.height = 1;
						anchor = pbLoc;
						break;
					}
				}
			}
		}
		else if (anchor.terminalId == -1)	// normalize empty categories
		{
			for (PBLoc pbLoc : linkArg.getLocs())
			{
				node = tree.getNode(pbLoc.terminalId, 0);
				if (node == null)	return false;
				
				if (node.isEmptyCategory())
					pbLoc.height = 0;
			}
		}
		
		for (PBArg pbArg : pbArgs)
		{
			if (!pbArg.isLabel("LINK.*") && pbArg.overlapsLocs(linkArg))
			{
				processLinkAux(linkArg, pbArg, anchor, tree);
				return true;
			}
		}
		
		for (PBArg pbArg : pbArgs)
		{
			if (!pbArg.isLabel("LINK.*") && pbArg.overlapsMildLocs(linkArg))
			{
				processLinkAux(linkArg, pbArg, anchor, tree);
				return true;
			}
		}
		
		return false;
	}
	
	private void processLinkAux(PBArg linkArg, PBArg pbArg, PBLoc anchor, TBTree tree)
	{
		TBNode node, comp;
		pbArg.putLocs(linkArg.getLocs());
		
		// find antecedents of complementizer
		if (anchor.terminalId != -1)
		{
			node = tree.getNode(anchor.terminalId, anchor.height);
			comp = node.getComplementizer();
			
			if (!comp.hasAntecedent())
			{
				Collections.sort(pbArg.getLocs());
				PBLoc anteLoc = pbArg.getLocs().get(0);
				
				comp.pbLoc.type = PBLib.PROP_OP_COMP;
				comp.antecedent = tree.getNode(anteLoc.terminalId, anteLoc.height);
			}
			else
			{
				pbArg.putLoc(comp.antecedent.pbLoc);
			}
		}
	}
	
	/** Finds empty categories' antecedents. */
	private boolean processEmtpyCategories(PBArg pbArg, TBTree tree, TBNode pred)
	{
		ArrayList<PBLoc> addLocs = new ArrayList<PBLoc>();
	//	ArrayList<PBLoc> delLocs = new ArrayList<PBLoc>();
		TBNode curr, node;
		
		for (PBLoc pbLoc : pbArg.getLocs())
		{
			curr = tree.getNode(pbLoc.terminalId, pbLoc.height);
			if (curr == null)	return false;
			
			if ((node = curr.getIncludedEmptyCategory("\\*ICH\\*.*")) != null && node.hasAntecedent())
			{
				node.antecedent.pbLoc.type = PBLib.PROP_OP_SKIP;
				addLocs.add(node.antecedent.pbLoc);
			}
			else if ((node = curr.getIncludedEmptyCategory("\\*RNR\\*.*")) != null && node.hasAntecedent())
			{
				node.antecedent.pbLoc.type = PBLib.PROP_OP_SKIP;
				addLocs.add(node.antecedent.pbLoc);
			}
			else if (curr.isEmptyCategoryRec())
			{
				do
				{
					pbLoc.height = 0;
					
					if (curr.isPhrase())
						curr = tree.getNode(pbLoc.terminalId, 0);
					
					if (curr.hasAntecedent())
					{
						pbLoc = curr.antecedent.pbLoc;
						addLocs.add(pbLoc);
					}
					else	break;
					
					curr = tree.getNode(pbLoc.terminalId, pbLoc.height);					
				}
				while (curr.isEmptyCategoryRec());
				
			/*	if (curr.isForm("\\*T\\*.*"))
				{
				//	delLocs.add(pbLoc);
					if (curr.hasAntecedent())	addLocs.add(curr.antecedent.pbLoc);
				}
				else if (curr.isForm("\\*PRO\\*.*|\\*|\\*-\\d"))
				{
					if (curr.getParent().isFollowedBy("VP"))
					{
						if (curr.hasAntecedent())
							addLocs.add(curr.antecedent.pbLoc);
					}
				//	else
				//		delLocs.add(pbLoc);
				}*/
			}
		}
		
		for (PBLoc pbLoc: addLocs)
			pbArg.putLoc(pbLoc);
		
	//	for (PBLoc pbLoc: delLocs)
	//		pbArg.removeLocs(pbLoc);
		
		trimEmptyCategories(pbArg, tree, pred);
		
		return true;
	}
	
	private void processEmtpyCategoriesSub(PBArg pbArg, TBTree tree, TBNode pred)
	{
		ArrayList<SRLHead> heads;
 		TBNode curr, node, term;
		
		for (PBLoc pbLoc : pbArg.getLocs())
		{
			curr = tree.getNode(pbLoc.terminalId, pbLoc.height);
			
			if (curr.isPos("S"))
			{
				for (TBNode child : curr.getChildren())
				{
					if (child.isEmptyCategoryRec() && (node = child.getIncludedEmptyCategory("\\*|\\*-\\d")) != null && node.hasAntecedent() && node.terminalId > pred.terminalId)
					{
						node = node.antecedent;
						term = tree.getTerminalNode(node.pbLoc.terminalId);
						if (term.isPos("EX") || term.form.equalsIgnoreCase("it"))	return;
						
						if ((heads = node.getPBHeads()) != null)
						{
							for (SRLHead head : heads)
							{
								if (head.equals(pred.terminalId+1))
									return;
							}					
						}
						
						node.pbLoc.type = "";
						pbArg.putLoc(node.pbLoc);
						
						return;
					}
				}
			}
		}
	}
	
	void trimEmptyCategories(PBArg pbArg, TBTree tree, TBNode pred)
	{
		ArrayList<PBLoc> pbLocs  = pbArg.getLocs();
		ArrayList<PBLoc> delLocs = new ArrayList<PBLoc>();
		Collections.sort(pbLocs);

		TBNode ante = tree.getNode(pbLocs.get(0));
		
		if (ante.isEmptyCategoryRec())
		{
			ante.antecedent = null;
			ante = null;
		}
		else
		{
			ante.pbLoc.type = PBLib.PROP_OP_ANTE;
		}
		
		boolean isFound = false;
		PBLoc pbLoc;
		TBNode curr;
		
		for (int i=pbLocs.size()-1; i>=0; i--)
		{
			pbLoc = pbLocs.get(i);
			curr  = tree.getNode(pbLoc);
			if (!curr.isEmptyCategoryRec()) continue;
            
			if (curr.isPhrase())
				curr = tree.getNode(pbLoc.terminalId, 0);
			
			if (curr.isForm("\\*PRO\\*.*|\\*|\\*-\\d"))
			{
				if (!isFound && pbLoc.terminalId < pred.terminalId)
				{
					isFound = true;
					curr.antecedent = ante;
				}
				else	delLocs.add(pbLoc);
			}
		}
        
		if (isFound)	pbLocs.removeAll(delLocs);
	}
	
	protected boolean isCyclic(ArrayList<PBInstance> pbInstances, TBTree tree)
	{
		ArrayList<IntOpenHashSet> list = new ArrayList<IntOpenHashSet>();
		TBNode node;
		
		for (PBInstance instance : pbInstances)
		{
			IntOpenHashSet set = new IntOpenHashSet();
			
			for (PBArg pbArg : instance.getArgs())
			{
				for (PBLoc pbLoc : pbArg.getLocs())
				{
					node = tree.getNode(pbLoc);
					set.addAll(node.getSubTermainlIDs());
				}
			}
			
			list.add(set);
		}
		
		int i, j, size = pbInstances.size();
		
		for (i=0; i<size; i++)
		{
			PBInstance iInstance = pbInstances.get(i);
			IntOpenHashSet  iSet = list.get(i);
			
			for (j=i+1; j<size; j++)
			{
				PBInstance jInstance = pbInstances.get(j);
				IntOpenHashSet  jSet = list.get(j);
				
				if (iSet.contains(jInstance.predicateId) && jSet.contains(iInstance.predicateId))
					return true;
			}
		}
		
		return false;
	}
	
	private boolean addPBArgToTBTree(PBArg pbArg, TBTree tree)
	{
		ArrayList<PBLoc> pbLocs = pbArg.getLocs();
		IntOpenHashSet   addIDs = new IntOpenHashSet();
		IntOpenHashSet   delIDs = new IntOpenHashSet();
		PBLoc            rLoc   = null;
		TBNode           node, tmp;
		String label;
		
		if (pbArg.label.matches("rel.*"))	label = "C-V";
		else								label = "A"+pbArg.label.substring(3);
		
		// retrieve all terminal IDs
		for (PBLoc pbLoc : pbLocs)
		{
			if ((node = tree.getNode(pbLoc)) == null)
				return false;
			
			tmp = tree.getTerminalNode(pbLoc.terminalId);

			if (pbLoc.isType(PBLib.PROP_OP_SKIP))
				delIDs.addAll(node.getSubTermainlIDs());
			
			if (node.isPos("WH.*"))
				rLoc = pbLoc;
			else if (node.isEmptyCategoryRec() && tmp.form.startsWith(TBEnLib.EC_PRO) && pbLoc.terminalId > pbArg.predicateId)
				continue;
			else
				addIDs.addAll(node.getSubTermainlIDs());
		}
		
		if (addIDs.isEmpty())	return true;
		
		// add terminal IDs
	/*	int[] ids = addIDs.toArray();
		Arrays.sort(ids);
		
		TBNode pred = tree.getTerminalNode(pbArg.predicateId);
		pred.addPBArg(new SRLArg(label, ids));*/
		
		// add each argument
		addIDs.removeAll(delIDs);
		int terminalId, height;
		String prefix = "";
		int[] ids;
		
		while (!addIDs.isEmpty())
		{
			height = 0;
			ids = addIDs.toArray();
			Arrays.sort(ids);
			terminalId = ids[0];
			
			while (true)
			{
				node = tree.getNode(terminalId, height+1);
				
				if (node == null || !JSet.isSubset(addIDs, node.getSubTermainlIDs()))
				{
					node = tree.getNode(terminalId, height);
					
					if (node.isEmptyCategoryRec())
						node.addPBHead(new SRLHead(pbArg.predicateId+1, label));
					else if (!TBEnLib.isPunctuation(node.pos))
						node.addPBHead(new SRLHead(pbArg.predicateId+1, prefix+label));
					
					addIDs.removeAll(node.getSubTermainlIDs());
					if (!node.isEmptyCategoryRec() && !TBEnLib.isPunctuation(node.pos) && !label.startsWith("AM"))
						prefix = "C-";
					
					break;
				}
				
				height++;
			}
		}
		
		prefix = "R-";
		
		if (rLoc != null)
		{
			node = tree.getNode(rLoc);
		//	ids  = node.getSubTermainlIDs().toArray();
		//	Arrays.sort(ids);
		//	pred.addPBArg(new SRLArg(prefix+label, ids));
			node.addPBHead(new SRLHead(pbArg.predicateId+1, prefix+label));
		}
		
		return true;
	}
	
	static public void main(String[] args)
	{
		new PropToDep(args);
	}
}
